{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "NLP Course - Week 3 Exercise Answer.ipynb",
      "version": "0.3.2",
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "hmA6EzkQJ5jt",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import json\n",
        "import tensorflow as tf\n",
        "import csv\n",
        "import random\n",
        "import numpy as np\n",
        "\n",
        "from tensorflow.keras.preprocessing.text import Tokenizer\n",
        "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
        "from tensorflow.keras.utils import to_categorical\n",
        "from tensorflow.keras import regularizers\n",
        "\n",
        "\n",
        "embedding_dim = 100\n",
        "max_length = 16\n",
        "trunc_type='post'\n",
        "padding_type='post'\n",
        "oov_tok = \"<OOV>\"\n",
        "training_size=160000\n",
        "test_portion=.1\n",
        "\n",
        "corpus = []\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bM0l_dORKqE0",
        "colab_type": "code",
        "outputId": "491ba86b-f780-4355-a4be-765565a29c8c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        }
      },
      "source": [
        "\n",
        "# Note that I cleaned the Stanford dataset to remove LATIN1 encoding to make it easier for Python CSV reader\n",
        "# You can do that yourself with:\n",
        "# iconv -f LATIN1 -t UTF8 training.1600000.processed.noemoticon.csv -o training_cleaned.csv\n",
        "# I then hosted it on my site to make it easier to use in this notebook\n",
        "\n",
        "!wget --no-check-certificate \\\n",
        "    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv \\\n",
        "    -O /tmp/training_cleaned.csv\n",
        "\n",
        "num_sentences = 0\n",
        "\n",
        "with open(\"/tmp/training_cleaned.csv\") as csvfile:\n",
        "    reader = csv.reader(csvfile, delimiter=',')\n",
        "    for row in reader:\n",
        "        list_item=[]\n",
        "        list_item.append(row[5])\n",
        "        this_label=row[0]\n",
        "        if this_label=='0':\n",
        "            list_item.append(0)\n",
        "        else:\n",
        "            list_item.append(1)\n",
        "        num_sentences = num_sentences + 1\n",
        "        corpus.append(list_item)\n",
        "\n",
        "\n"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2019-06-07 17:53:35--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/training_cleaned.csv\n",
            "Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.192.128, 2607:f8b0:4001:c1d::80\n",
            "Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.192.128|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 238942690 (228M) [application/octet-stream]\n",
            "Saving to: ‘/tmp/training_cleaned.csv’\n",
            "\n",
            "/tmp/training_clean 100%[===================>] 227.87M   221MB/s    in 1.0s    \n",
            "\n",
            "2019-06-07 17:53:36 (221 MB/s) - ‘/tmp/training_cleaned.csv’ saved [238942690/238942690]\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3kxblBUjEUX-",
        "colab_type": "code",
        "outputId": "3c0227a2-e74b-4d9b-cabb-f9ee150571b1",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 68
        }
      },
      "source": [
        "print(num_sentences)\n",
        "print(len(corpus))\n",
        "print(corpus[1])\n",
        "\n",
        "# Expected Output:\n",
        "# 1600000\n",
        "# 1600000\n",
        "# [\"is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!\", 0]"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "1600000\n",
            "1600000\n",
            "[\"is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!\", 0]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ohOGz24lsNAD",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "sentences=[]\n",
        "labels=[]\n",
        "random.shuffle(corpus)\n",
        "for x in range(training_size):\n",
        "    sentences.append(corpus[x][0])\n",
        "    labels.append(corpus[x][1])\n",
        "\n",
        "\n",
        "tokenizer = Tokenizer()\n",
        "tokenizer.fit_on_texts(sentences)\n",
        "\n",
        "word_index = tokenizer.word_index\n",
        "vocab_size=len(word_index)\n",
        "\n",
        "sequences = tokenizer.texts_to_sequences(sentences)\n",
        "padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)\n",
        "\n",
        "split = int(test_portion * training_size)\n",
        "\n",
        "test_sequences = padded[0:split]\n",
        "training_sequences = padded[split:training_size]\n",
        "test_labels = labels[0:split]\n",
        "training_labels = labels[split:training_size]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "gIrtRem1En3N",
        "colab_type": "code",
        "outputId": "4ad8401c-8dba-420d-8aee-38dac0b0839a",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "print(vocab_size)\n",
        "print(word_index['i'])\n",
        "# Expected Output\n",
        "# 138858\n",
        "# 1"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "138858\n",
            "1\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "C1zdgJkusRh0",
        "colab_type": "code",
        "outputId": "b6edd322-8191-45e7-cb12-08921685a72f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        }
      },
      "source": [
        "# Note this is the 100 dimension version of GloVe from Stanford\n",
        "# I unzipped and hosted it on my site to make this notebook easier\n",
        "!wget --no-check-certificate \\\n",
        "    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \\\n",
        "    -O /tmp/glove.6B.100d.txt\n",
        "embeddings_index = {};\n",
        "with open('/tmp/glove.6B.100d.txt') as f:\n",
        "    for line in f:\n",
        "        values = line.split();\n",
        "        word = values[0];\n",
        "        coefs = np.asarray(values[1:], dtype='float32');\n",
        "        embeddings_index[word] = coefs;\n",
        "\n",
        "embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));\n",
        "for word, i in word_index.items():\n",
        "    embedding_vector = embeddings_index.get(word);\n",
        "    if embedding_vector is not None:\n",
        "        embeddings_matrix[i] = embedding_vector;"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2019-06-07 17:55:30--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt\n",
            "Resolving storage.googleapis.com (storage.googleapis.com)... 64.233.183.128, 2607:f8b0:4001:c12::80\n",
            "Connecting to storage.googleapis.com (storage.googleapis.com)|64.233.183.128|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 347116733 (331M) [text/plain]\n",
            "Saving to: ‘/tmp/glove.6B.100d.txt’\n",
            "\n",
            "/tmp/glove.6B.100d. 100%[===================>] 331.04M   160MB/s    in 2.1s    \n",
            "\n",
            "2019-06-07 17:55:33 (160 MB/s) - ‘/tmp/glove.6B.100d.txt’ saved [347116733/347116733]\n",
            "\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "71NLk_lpFLNt",
        "colab_type": "code",
        "outputId": "97cb88db-754f-4375-fdc3-876cd6b4fdce",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "print(len(embeddings_matrix))\n",
        "# Expected Output\n",
        "# 138859"
      ],
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "138859\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "iKKvbuEBOGFz",
        "colab": {}
      },
      "source": [
        "model = tf.keras.Sequential([\n",
        "    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),\n",
        "    tf.keras.layers.Dropout(0.2),\n",
        "    tf.keras.layers.Conv1D(64, 5, activation='relu'),\n",
        "    tf.keras.layers.MaxPooling1D(pool_size=4),\n",
        "    tf.keras.layers.LSTM(64),\n",
        "    tf.keras.layers.Dense(1, activation='sigmoid')\n",
        "])\n",
        "model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])\n",
        "model.summary()\n",
        "\n",
        "num_epochs = 50\n",
        "history = model.fit(training_sequences, training_labels, epochs=num_epochs, validation_data=(test_sequences, test_labels), verbose=2)\n",
        "\n",
        "print(\"Training Complete\")\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "qxju4ItJKO8F",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import matplotlib.image  as mpimg\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "#-----------------------------------------------------------\n",
        "# Retrieve a list of list results on training and test data\n",
        "# sets for each training epoch\n",
        "#-----------------------------------------------------------\n",
        "acc=history.history['acc']\n",
        "val_acc=history.history['val_acc']\n",
        "loss=history.history['loss']\n",
        "val_loss=history.history['val_loss']\n",
        "\n",
        "epochs=range(len(acc)) # Get number of epochs\n",
        "\n",
        "#------------------------------------------------\n",
        "# Plot training and validation accuracy per epoch\n",
        "#------------------------------------------------\n",
        "plt.plot(epochs, acc, 'r')\n",
        "plt.plot(epochs, val_acc, 'b')\n",
        "plt.title('Training and validation accuracy')\n",
        "plt.xlabel(\"Epochs\")\n",
        "plt.ylabel(\"Accuracy\")\n",
        "plt.legend([\"Accuracy\", \"Validation Accuracy\"])\n",
        "\n",
        "plt.figure()\n",
        "\n",
        "#------------------------------------------------\n",
        "# Plot training and validation loss per epoch\n",
        "#------------------------------------------------\n",
        "plt.plot(epochs, loss, 'r')\n",
        "plt.plot(epochs, val_loss, 'b')\n",
        "plt.title('Training and validation loss')\n",
        "plt.xlabel(\"Epochs\")\n",
        "plt.ylabel(\"Loss\")\n",
        "plt.legend([\"Loss\", \"Validation Loss\"])\n",
        "\n",
        "plt.figure()\n",
        "\n",
        "\n",
        "# Expected Output\n",
        "# A chart where the validation loss does not increase sharply!"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}